%reload_ext autoreload
%autoreload 2
import sys
sys.path.append("../Functions")
import warnings
warnings.filterwarnings(action = 'ignore')
import custom_functions
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatch
from matplotlib_venn import venn2
import plotly.express as px
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.cluster import KMeans
# Global plotting style for every figure in this notebook.
plt.style.use('ggplot')
# Set all the sizes (title, figure, axis labels, tick labels) in one shot.
mpl.rcParams.update({
    "axes.titlesize": 22,        # plot title size
    "figure.figsize": (15, 7),   # default figure size
    "axes.labelsize": 18,        # x and y axis label size
    "xtick.labelsize": 16,       # x tick label size
    "ytick.labelsize": 16,       # y tick label size
})
# Reading the excel file, sheet = Coles Transactions
data = pd.read_excel("DATA/Simulated Coles Data.xlsx", sheet_name = "Coles Transactions")
# Shape of the dataframe as (rows, columns)
data.shape
Data has 58,100 observations and 53 variables.
# Name of the columns
data.columns
# Getting first 5 rows of the data
data.head()
# Getting summary of all the columns
data.describe()
# Information about the dataframe
# show_counts = True forces the per-column non-null count to be printed.
# (The old null_counts argument was deprecated in pandas 1.2 and removed in 2.0.)
data.info(show_counts = True)
The columns having count less than 58,100 means those columns have missing values in them. Most of the columns are of type integer and float.
# Number of missing values per column (isnull is the isna alias)
data.isnull().sum()
Most of the missing values are in column PostCode which is 9792 and second highest number of missing values is in cereal column which is 9.
# Grand total of missing cells across the whole dataframe
print("Total number of missing values is :", data.isna().sum().sum())
# Plotting the missing values
# (each highlighted cell in the heatmap marks a missing entry)
plt.figure(figsize = (20,7))
sns.heatmap(data.isna())
plt.show()
On the y-axis the numbers represent the row number of the observation and x-axis represents the column. Colorbar on the right represents the intensity of the missing value. This graph above confirms the statement we just made before about the missing values distribution.
# Numeric summary, re-checked after looking at the missing values
data.describe()
# Count of missing ReceiptID values
data["ReceiptID"].isna().sum()
# Total number of rows
len(data["ReceiptID"])
# Number of distinct ReceiptID values (dropna = False so the count
# matches len(set(...)) even if missing values were present)
data["ReceiptID"].nunique(dropna = False)
9 duplicate entries in Receipt ID.
data[data.duplicated("ReceiptID", keep = False)].sort_values("ReceiptID")
Here only ReceiptID has duplicate values not other columns that means transactions are different. We won't be needing ReceiptID for our analysis, so we are not dealing with this issue.
# Summary of the value column
data["Value"].describe()
# Checking if there are missing values or not
data["Value"].isna().sum()
# Visualization before data cleaning
%autoreload 2
# Helper from ../Functions/custom_functions.py: histogram stacked with a boxplot
custom_functions.create_histogram_plus_boxplot(data["Value"], "Value", color = "royalblue", ylabel = "Frequency",
xlabel = "Value in dollars" , size = (15,13))
Histogram is right skewed. Most of the values are in between 0 - 250. By looking at the boxplot we can say that there are many outliers, but only 3 of them are extreme which are \$802.0592, \\$1242.9862, \$1967.6968. These observations could be explained by saying that they could be order for a big party or an occasion. These values might cause problems when we will be doing clustering, so I will be changing them to the median value of the column Value excluding extreme outliers. Since we are changing only 3 observation it won't be affecting results of future analysis.
# Replacing extreme outliers (> $500) with the median value
# NOTE(review): the median here is computed on the full column, i.e. including the
# 3 extreme values; with only 3 of ~58k rows affected the difference is negligible.
data.loc[data["Value"] > 500, "Value"] = data.Value.median()
# Summary after Data cleaning
data["Value"].describe()
# Visualization after data cleaning
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["Value"], "Value", color = "royalblue", ylabel = "Frequency",
xlabel = "Value in dollars" , size = (15,13))
Distribution of Value is right skewed. It means the transaction value generally lies between \$25 - \\$120. Transactions with a high value occur less often, while transactions with a low value are quite common. There are still some outliers but they are not extreme, so I am leaving them in the data as they are.
# Summary of the payment-method code (codes 1-4 are valid, see the mapping below)
data["pmethod"].describe()
# Checking if there are missing values or not
data["pmethod"].isna().sum()
# Checking for invalid entries (codes above 4)
data[data["pmethod"] > 4]
# Find trend in the missing values with homeown and pmethod as well as nchildren
data[data["pmethod"] > 4].sort_values("pmethod", ascending = False).iloc[:30,:]
There are 97 entries which are greater than 4 (i.e. invalid). We can also see that observations which have invalid values for pmethod also have invalid values for homeown(> 3)[homeown 1 : Yes, 2 : No, 3 : Unknown] and nchildren. The values are increasing at the same rate (incremented by one in each row). We will remove these observations as the invalid values persist in more than one column.
# Filtering out the data: keep only the valid payment codes 1-4
data = data[data["pmethod"] < 5]
# Converting the numeric field to categorical
data["pmethod"] = data["pmethod"].astype("category")
# Creating a new variable replacing categorical codes to actual category names.
# Series.map keeps the index aligned and is consistent with how sex and
# homeown are mapped below (the previous .values.map returned a bare
# Categorical with no index).
data["pmethod_processed"] = data["pmethod"].map({1 : "Cash", 2 : "Card", 3 : "EFTPOS", 4 : "Other"})
# Getting the frequency of the categories
pmethod_count_series = data["pmethod_processed"].value_counts()
pmethod_count_series
There are 4 unique values for pmethod. Most common of them is card with frequency of 24720.
# Visualization after data cleaning
%autoreload 2
# NOTE(review): the labels list assumes the frequency order is
# Card, EFTPOS, Cash, Other — confirm it matches pmethod_count_series.index.
custom_functions.barplot(pmethod_count_series, "Payment method", "Counts", "Payment Methods",
color=["red","green", "blue", "grey"], labels = ["Card", "EFTPOS", "Cash", "Other"])
Most common form of payment is via card and EFTPOS.
# Summary of the sex code (1 : Male, 2 : Female, per the mapping below)
data["sex"].describe()
# Checking if there are missing values or not
data["sex"].isna().sum()
No invalid entries
# Converting sex into categorical variable
data["sex"] = data["sex"].astype("category")
# Creating a new variable replacing categorical codes to actual category names
data["sex_processed"] = data["sex"].map({1 : "Male", 2 : "Female"})
# Getting the frequency of the categories
sex_counts_series = data["sex_processed"].value_counts()
sex_counts_series
# Donut Chart: a pie chart with a white circle drawn over the centre
plt.pie(sex_counts_series, colors = ["pink", "skyblue"], labels = ["Female", "Male"],
autopct = '%1.1f%%', startangle = 90, pctdistance = 0.80, textprops = {"fontsize" : 18})
plt.gca().add_artist(plt.Circle((0, 0), 0.60, fc = 'white'));
plt.title("Gender Proportions")
plt.show()
# Summary of the home-ownership code (1 : Yes, 2 : No, 3 : Unknown, per the mapping below)
data["homeown"].describe()
# Checking if there are missing values or not
data["homeown"].isna().sum()
# Filtering the data: rows with an invalid homeown code (> 3)
data[data["homeown"] > 3]
For above two observations the value of homeown column is invalid. Also the values for nchildren is quite unusual so I will be removing these two observations.
# Filtering out the data: drop the rows with invalid homeown codes
data = data[data["homeown"] < 4]
# Converting the homeown from integer to categorical variable
data["homeown"] = data["homeown"].astype("category")
# Creating a new variable replacing categorical codes to actual category names
data["homeown_processed"] = data["homeown"].map({1 : "Yes", 2 : "No", 3: "Unknown"})
# Getting the frequency of the categories
homeown_count_series = data["homeown_processed"].value_counts()
homeown_count_series
%autoreload 2
custom_functions.barplot(homeown_count_series, "Does customer own a home?", "Counts", "Homeown",
color=["red", "green", "blue"], labels = ["Yes", "No", "Unknown"])
Around 42,000 of the customers own a house where around 15,000 of the customers are maybe renting the place they are staying.
# Summary
data["income"].describe()
# Checking if there are missing values or not
data["income"].isna().sum()
# Checking for missing values
data[data["income"].isna()]
# Replacing the missing value with the MEDIAN value (robust to the skewed income distribution)
data.loc[data["income"].isna(), "income"] = data["income"].median()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["income"], "Income", color = "seagreen", ylabel = "Frequency",
xlabel = "Income in dollars" , size = (15,13))
Distribution of income is right skewed(as expected). Number of people with high salaries are low compare to number of people with low salaries. We can see there are many outliers. But there is one extreme outlier where income is around \$650235.25. We will replace this value with median as well because it will affect the clustering results.
# Replacing the extreme outlier (the single maximum income) with the median value
data.loc[data["income"] == data["income"].max(), "income"] = data["income"].median()
data["income"].describe()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["income"], "Income", color = "seagreen", ylabel = "Frequency",
xlabel = "Income in dollars" , size = (15,13))
The distribution somewhat looks normal to me. People with extreme salaries(either low or high) are less compare to median salary which is around \$70k. There are still outliers in the data but since distribution somewhat looks like normal(usually found in real life) I won't be dealing with them.
# summary
data["age"].describe()
# Checking if there are missing values or not
data["age"].isna().sum()
# Checking the missing value observation
data[data["age"].isna()]
# Replacing the missing value with the median (robust to the slight right skew)
data.loc[data.age.isna(), "age"] = data["age"].median()
# Checking if there are missing values or not
data["age"].isna().sum()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["age"], "Age", color = "salmon", ylabel = "Frequency",
xlabel = "Age in Years" , size = (15,13))
Distribution is almost normal (slightly right skewed). The age ranges from 10 to 95 years. There are many outliers but none of them are extreme. Here I would like to explore young customers with age less than 16.
# Inspect customers younger than 16 — a data-quality check
under_16 = data["age"] < 16
young_customers = data.loc[under_16, :]
young_customers
# Their income profile
young_customers["income"].describe()
# Do they "own" a home?
young_customers["homeown"].value_counts()
As we can see, most of the young customers have high income and they own a house, which raises a question regarding the quality of the data. Also, for some observations the value of age is in decimal points and not a whole number (we don't know how the data was collected, so we can't rule out ages with decimal points as invalid). I will be removing the records where the age of the customer is < 16, because they have high income, which doesn't make any sense.
# Dropping the suspicious under-16 customers identified above
data = data.loc[data["age"] >= 16, :]
# Summary of age after cleaning
data["age"].describe()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["age"], "Age", color = "salmon", ylabel = "Frequency",
xlabel = "Age in Years" , size = (15,13))
Distribution is almost normal(slightly right skewed). The age is from 16 to 95 years. There are many outliers but none of them are extreme.
# Summary
data["PostCode"].describe()
# Missing values — PostCode has the most by far (kept as-is; unused in the analysis)
data["PostCode"].isna().sum()
Around 9788(~17%) of the values are missing, As this is significant amount of observations we can't just remove them. We will keep them as it is as this field is not useful for our analysis.
# summary
data["nchildren"].describe()
# Missing values
data["nchildren"].isna().sum()
# Inspect the rows with a missing nchildren before imputing
data[data["nchildren"].isna()]
There are two missing values for number of children. As values for other variables look good, I am going to replace the missing values with median value of nchildren.
# Replacing missing values with the median value
data.loc[data["nchildren"].isna(), "nchildren"] = data["nchildren"].median()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["nchildren"], "No of Children", color = "orange", ylabel = "Frequency",
xlabel = "No of Children" , size = (15,13))
There are few outliers (nchildren > 5). The field is not useful for the analysis, so I am going to leave the outliers as they are — as you can see, the occurrence of the outliers in the data is rare. nchildren is supposed to be discrete, not continuous, so a histogram is not the best choice.
# Converting the floating nchildren to an integer-coded categorical
data["nchildren_processed"] = data["nchildren"].astype("int").astype("category")
# Summary
nchildren_counts = data["nchildren_processed"].value_counts()
nchildren_counts
# NOTE(review): value_counts() already returns counts sorted in descending
# order, so this in-place sort looks redundant (left in place to avoid any
# reordering of tied counts).
nchildren_counts.sort_values(inplace = True, ascending = False)
%autoreload 2
custom_functions.barplot(nchildren_counts, "Children Distribution", "Counts", "No of Children",
color = ("salmon", "lightgreen", "cornflowerblue",) * 4, labels = nchildren_counts.index)
Around 32% the customers have one child, followed by 29% of them having no child. % of customers having 4 or more children is 0.08, which is rare.
# Derive a parent flag from the number of children (vectorized instead of a list comprehension)
data["parent"] = np.where(data["nchildren"] > 0, "Yes", "No")
# Store the flag as a categorical
data["parent"] = data["parent"].astype("category")
# Donut Chart of parent vs non-parent proportions
plt.pie(data["parent"].value_counts(), colors = ["royalblue", "orange"], labels = ["Yes", "No"],
autopct = '%1.1f%%', startangle = 90, pctdistance = 0.80, textprops = {"fontsize" : 18})
plt.gca().add_artist(plt.Circle((0, 0), 0.60, fc = 'white'));
plt.title("Parent Proportions")
plt.show()
Almost 71% of the customers are parents.
# Summary
data["fruit"].describe()
# Frequency of values
data["fruit"].value_counts()
# Filtering data
## o should be 0 by common sense (a data-entry typo; it makes the column object dtype)
data.loc[data["fruit"] == "o", "fruit"] = 0
## Values greater than 1 should be 1 (they are presumably quantities, not purchase flags)
data.loc[data["fruit"] > 1, "fruit"] = 1
# Checking for missing values
data["fruit"].isna().sum()
# Summary
data["fruit"].value_counts()
# Summary
data["cannedveg"].describe()
# Frequency
data["cannedveg"].value_counts()
# Missing values
data["cannedveg"].isna().sum()
# Replacing missing values with 0, because a missing value suggests the product was not purchased
data.loc[data["cannedveg"].isna(), "cannedveg"] = 0
# Summary
data["cereal"].describe()
# Frequency
data["cereal"].value_counts()
# Missing values
data["cereal"].isna().sum()
# A missing value means the product was not purchased, so fill with 0
data["cereal"] = data["cereal"].fillna(0)
# Summary
data["PizzaBase"].describe()
# Frequency
data["PizzaBase"].value_counts()
# Missing values
data["PizzaBase"].isna().sum()
# Replacing missing values with 0, because a missing value suggests the product was not purchased
data.loc[data["PizzaBase"].isna(), "PizzaBase"] = 0
# Summary
data["milk"].describe()
# Frequency
data["milk"].value_counts()
# Missing values
data["milk"].isna().sum()
# A missing value means the product was not purchased, so fill with 0
data["milk"] = data["milk"].fillna(0)
# Summary
data["fruitjuice"].describe()
# Frequency
data["fruitjuice"].value_counts()
# Missing values
data["fruitjuice"].isna().sum()
# A value of 2 was probably the purchased quantity; collapse it to the 0/1 flag
data["fruitjuice"] = data["fruitjuice"].replace(2, 1)
# Summary
data["confectionery"].describe()
# Frequency
data["confectionery"].value_counts()
# Missing values
data["confectionery"].isna().sum()
# Replacing missing values with 0, because a missing value suggests the product was not purchased
data.loc[data["confectionery"].isna(), "confectionery"] = 0
All other basket items don't need any kind of data cleaning. I will just change the type of all items from integer to category in the next cell.
# The 44 basket columns: one 0/1 purchase flag per product
basket = ['fruit', 'freshmeat', 'dairy','MozerallaCheese', 'cannedveg', 'cereal', 'frozenmeal', 'frozendessert','PizzaBase',
'TomatoSauce', 'frozen fish', 'bread', 'milk', 'softdrink', 'fruitjuice', 'confectionery', 'fish', 'vegetables',
'icecream', 'energydrink', 'tea', 'coffee', 'laundryPowder', 'householCleaners', 'corn chips', 'Frozen yogurt',
'Chocolate', 'Olive Oil', 'Baby Food', 'Napies', 'banana', 'cat food', 'dog food', 'mince', 'sunflower Oil',
'chicken', 'vitamins', 'deodorants', 'dishwashingliquid', 'onions', 'lettuce', 'KitKat', 'TeaTowel', 'Scones']
# Changing the type of every basket flag from float/int to an int-coded categorical
for item in basket:
data[item] = data[item].astype("int").astype("category")
# Summary (split into column chunks so the output stays readable)
print(data.loc[:,basket[:20]].describe())
print(data.loc[:,basket[20:40]].describe())
print(data.loc[:,basket[40:44]].describe())
# Getting the count that shows how many times each item was purchased out of 58001 transactions
basket_counts = data.loc[:,basket].sum().sort_values(ascending = False)
basket_counts
# Product Frequency Percentage
basket_counts/len(data) * 100
%autoreload 2
custom_functions.barplot(basket_counts, "Frequency of the items", "Counts", "Items",
color = "grey", labels = basket_counts.index, figure_size = (20,10), xrotation = 90)
The MOST purchased items are :
bread 82.852036%
milk 81.342196%
cereal 76.354517%
banana 76.186179%
lettuce 74.310159%
The LEAST purchased items are :
KitKat 1.645205%
energydrink 1.855194%
frozen fish 2.941585%
TeaTowel 3.708653%
icecream 4.361181%
# Re-checking Value on the final cleaned data
data["Value"].describe()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["Value"], "Value", color = "royalblue", ylabel = "Frequency",
xlabel = "Value in dollars" , size = (15,13))
Distribution of Value is right skewed. It means the transaction value generally lies between \$0.90* - \\$115. Transactions with a high value occur less often, while transactions with a low value are quite common.
[* - ALMOST]
# Getting Value of the transaction for males and females
# Creating boolean flags
flag_female = data["sex_processed"].eq('Female')
flag_male = data["sex_processed"].eq('Male')
# Subsetting the data
female = data[flag_female]
male = data[flag_male]
# Spend summary per gender
print("Male :\n", male["Value"].describe())
print("Female :\n", female["Value"].describe())
# Payment-method distribution on the cleaned data
data["pmethod_processed"].describe()
data["pmethod_processed"].value_counts()/data.shape[0]
%autoreload 2
custom_functions.barplot(pmethod_count_series, "Payment method", "Counts", "Payment Methods",
color=["red","green", "blue", "grey"], labels = ["Card", "EFTPOS", "Cash", "Other"])
Most common form of payment is via card and EFTPOS.
# Gender distribution on the cleaned data
data["sex_processed"].describe()
# Recompute the counts from the current data
sex_counts_series = data["sex_processed"].value_counts()
sex_counts_series
# Donut Chart
plt.pie(sex_counts_series, colors = ["pink", "skyblue"], labels = ["Female", "Male"],
autopct = '%1.1f%%', startangle = 90, pctdistance = 0.80, textprops = {"fontsize" : 18})
plt.gca().add_artist(plt.Circle((0, 0), 0.60, fc = 'white'));
plt.title("Gender Proportions")
plt.show()
Almost 60% of the customers are Female and the rest of the customers are Male.
# Home-ownership distribution on the cleaned data
data["homeown_processed"].describe()
# Recompute the counts from the current data
homeown_count_series = data["homeown_processed"].value_counts()
homeown_count_series
%autoreload 2
custom_functions.barplot(homeown_count_series, "Does customer own a home?", "Counts", "Homeown",
color=["red", "green", "blue"], labels = ["Yes", "No", "Unknown"])
Most of the customers(73%) own house(s), rest of them might have been renting the house(s).
# Transaction value for homeowners
flag_homeowners = data["homeown_processed"] == "Yes"
homeowners = data.loc[flag_homeowners, :]
print("HomeOwners : \n", homeowners["Value"].describe())
# Transaction value for not homeowners
flag_not_homeowners = data["homeown_processed"] == "No"
not_homeowners = data.loc[flag_not_homeowners, :]
print("NOT HomeOwners : \n", not_homeowners["Value"].describe())
# Income distribution on the cleaned data
data["income"].describe()
%autoreload 2
custom_functions.create_histogram_plus_boxplot(data["income"], "Income", color = "seagreen", ylabel = "Frequency",
xlabel = "Income in dollars" , size = (15,13))
The distribution somewhat looks normal to me. People with extreme salaries(either low or high) are less compare to median salary which is around \$70k. We can also see that there is a break after income \\$120,000 and customers with salary higher that \$120,000 is slightly high.
# Dog owners: count of transactions that include dog food (label-based lookup of value 1)
dog_owners = data.loc[:,"dog food"].value_counts()[1]
# Cat owners
cat_owners = data.loc[:,"cat food"].value_counts()[1]
# Owns both cat and dog
flag = (data["dog food"].values == 1) & (data["cat food"].values == 1)
both_owners = flag.sum()
#### CORRECTED #### subtract the overlap so the three Venn regions are disjoint
# Dog owners only
dog_owners -= both_owners
# Cat owners only
cat_owners -= both_owners
# Total number of pet owners
total = dog_owners + cat_owners + both_owners
# First way to call the 2 group Venn diagram:
v2 = venn2(subsets = {'10': dog_owners,
'01': cat_owners,
'11': both_owners},
set_labels = ('Dog Owners', 'Cat Owners'), set_colors = ('blue', 'salmon'), alpha = 0.4)
# Annotate each region with its count and percentage of all pet owners
v2.get_label_by_id('10').set_text('%d\n(%.1f%%)' % (dog_owners, dog_owners / total * 100))
v2.get_label_by_id('01').set_text('%d\n(%.1f%%)' % (cat_owners, cat_owners / total * 100))
v2.get_label_by_id('11').set_text('%d\n(%.1f%%)' % (both_owners, both_owners / total * 100))
for text in v2.set_labels:
text.set_fontsize(18)
for text in v2.subset_labels:
text.set_fontsize(20)
plt.title("Pets Insight")
plt.show()
31.37%(18076) of the all the customers have pet(s). Out of them 53% own only dogs, 37% only own cats and 9% own both cat(s) and dog(s).
# New Variable: Pet Owner = bought dog food OR cat food
flag_new = (data["dog food"].values == 1) | (data["cat food"].values == 1)
data["Pet Owner"] = np.where(flag_new, "Yes", "No")
data["Pet Owner"].value_counts()
# Pairwise correlation between the numeric variables
data[["Value", "income", "age", "nchildren"]].corr()
As we can see that there is not strong relationship between numerical variable.
Before we start MBA(Market Basket Analysis), I will explain few terms of MBA.
RULES
Given a rule "A -> C", A stands for antecedent(Item A) and C(Item C) stands for consequent.
ANTECEDENT SUPPORT
It computes the proportion of transactions that contain the antecedent A.
CONSEQUENT SUPPORT
It computes the support for the itemset of the consequent C.
SUPPORT
Support is used to measure the frequency (often interpreted as significance or importance) of an itemset in a database(all the transactions here).
support(A -> C) = support(A ∪ C)
The 'support' metric then computes the support of the combined itemset A ∪ C -- ''support' depends on 'antecedent support' and 'consequent support' via min('antecedent support', 'consequent support')
[We refer to an itemset as a "frequent itemset" if its support is larger than a specified minimum-support threshold.]
[Due to the downward closure property, all subsets of a frequent itemset are also frequent.]
CONFIDENCE
The confidence of a rule A -> C is the probability of seeing the consequent in a transaction given that it also contains the antecedent.
confidence(A -> C) = support(A -> C) / support(A)
This metric is not symmetric or directed; for instance, the confidence for A -> C is different than the confidence for C -> A.
[The confidence is 1 (maximal) for a rule A->C if the consequent and antecedent always occur together.]
LIFT
The lift metric is commonly used to measure how much more often the antecedent and consequent of a rule A -> C occur together than we would expect if they were statistically independent.
lift(A -> C) = confidence(A -> C) / support(C)
# Running apriori algorithm on all basket items
mba = apriori(data.loc[:, basket], min_support = 0.10, use_colnames = True, low_memory = True)
# Mining rules
# min threshold for lift is 1.
rules = association_rules(mba, metric = "lift", min_threshold = 1)
# Sorting rules by support in descending order
rules_by_supp = rules.sort_values("support", ascending = False)
# Sorting rules by confidence in descending order
rules_by_conf = rules.sort_values("confidence", ascending = False)
# Sorting rules by lift in descending order
rules_by_lift = rules.sort_values("lift", ascending = False)
# Top 10 rules by support
rules_by_supp[:10]
EXPLANATION
Bread & Cereal is the most popular combination of products being bought 63.9% of the time. The second and third most popular combinations of product are Bread & Banana(63.8%) and Milk and Cereal(62.2%). Support for these rules is high enough, so there is no need to give discounts on these combinations of the product.
These product combinations are the most obvious as we know these products(Bread, Milk, Cereal and Banana) were purchased the most and don't have much value to us.
As we know that these products were purchased the most, we can use that in many ways.
1. Coles should put newly marketed products near the shelf of these products, So that customers are exposed to the new products.
2. Coles should put these products to the end of the aisle, So that customers have to walk more in the stores, in turn they will be tempted to buy other products they see on their way to the most purchased product aisle.
rules_by_conf[:10]
EXPLANATION
We can see that the rules have high confidence(around 99%), the reason behind it is Banana. Because Banana was present in almost 76% of the transaction. It is appearing in all the rules with the high confidence. What we will do is we will remove the five most occuring products from the data and then we will run apriori algorithm again.
rules_by_lift[:10]
EXPLANATION
Same problem as I mentioned above we can see Banana in all the rules.
# Basket with the five most frequent items (bread, milk, cereal, banana, lettuce) removed
basket_new = ['fruit', 'freshmeat', 'dairy','MozerallaCheese', 'cannedveg', 'frozenmeal', 'frozendessert',
'PizzaBase', 'TomatoSauce', 'frozen fish', 'softdrink', 'fruitjuice', 'confectionery',
'fish', 'vegetables', 'icecream', 'energydrink', 'tea', 'coffee', 'laundryPowder', 'householCleaners',
'corn chips', 'Frozen yogurt', 'Chocolate', 'Olive Oil', 'Baby Food', 'Napies', 'cat food',
'dog food', 'mince', 'sunflower Oil', 'chicken', 'vitamins', 'deodorants', 'dishwashingliquid', 'onions',
'KitKat', 'TeaTowel', 'Scones']
# Running apriori algorithm on the reduced basket
mba_new = apriori(data.loc[:, basket_new], min_support = 0.10, use_colnames = True, low_memory = True)
# Mining rules
# min threshold for lift is 1.
rules_new = association_rules(mba_new, metric = "lift", min_threshold = 1)
# Sorting rules by support in descending order
rules_by_supp_new = rules_new.sort_values("support", ascending = False)
# Sorting rules by confidence in descending order
rules_by_conf_new = rules_new.sort_values("confidence", ascending = False)
# Sorting rules by lift in descending order
rules_by_lift_new = rules_new.sort_values("lift", ascending = False)
# Top 10 rules by confidence
rules_by_conf_new[:10]
EXPLANATION
The probabilities that Customers will buy Baby Food are 95.8% and 94.2% when they buy (Napies & TomatoSauce) and (Napies and Olive Oil). Similarly, probabilities that customers will buy Vegetables are 93.9% and 93.5% when they buy (householCleaners, frozenmeal & coffee) and (householCleaners, TomatoSauce & coffee). However, these combinations of products are less likely to occur(10% - 14%).
RULES 1564 and 2126 are rather strong - they both have high lift and confidence indicating that customers who buy antecedent products are 1.9 times more likely to buy the consequent products. RULES 2780 and 2888 are less obvious relationships. Although, the lift for these rules is low, coles could probably increase sales by advertising/marketing householCleaners, coffee, frozenmeal, TomatoSauce with Vegetables(if possible stock them near).
Obvious rules
(Chocolate, Napies) -> (Baby Food).
(Napies, fruit) -> (Baby Food).
==> Parents often buy fruit and chocolates for their young ones.
(Napies, TomatoSauce) -> (Baby Food).
(Napies, Olive Oil) -> (Baby Food).
==> By looking at the rules with Baby Food in consequent(with low support), We can give discounts on Baby Food if customers by (Napies & fruit), (Napies, Chocolate), (Napies, Olive Oil) or (Napies & TomatoSauce) to increase the support of these transactions[If it is not possible to carry out marketing for all the combinations, what we can do is we can give discounts on Baby Food if customer buys combination of products mentioned above. OR coles should make sure that these products are as close as possible to Baby Food.].
rules_by_lift_new[:15]
RULE 1591 - As we can see, Customers who bought Vegetables & Fish are 2.2 times more likely to buy householCleaner. These items were bought together for like 10% of the time but 85% of the customers who bought Vegetables & Fish also bought householCleaners. These three products need to be marketed together or perhaps give discount on householCleaner to the customer who buys Vegetable & Fish.
1. Coles should spatially separate fish and householCleaner for greater travel distance, so that the customer will be encouraged to purchase other products.
I. Initialization - First step is to randomly assign centroid for the clusters. This is typically done by randomly choosing K(here 4) points from the input set(we have created 4 centroids it using standard deviation and mean of the data in the above cell).
II. Cluster Assignment - Here each observation(each data point) is assigned to cluster centroid such that is minimum or is maximum. Various metrics can be used to calculate similarities/dissimilarities between data points. This generally means assigning the data point to the closest centroid point. Here I have used Euclidean distance, which goes like following.
$$ d(P, C) = \sqrt{(x_{1} - X)^2 + (y_{1} - Y)^2} $$where $(x_{1}, y_{1})$ is an observation point($P$) and $(X, Y)$ is a centroid point($C$).
III. Centroid Update - After all the points or observations are assigned to the centroids, each centroid is computed again. Here we are using Kmeans that's why new centroids will be computed by averaging(taking mean) the observation or data points which were assigned to the centroids in the II step. [You can see that centroid are being pulled by data points]
IV. Convergence - Step II and III are repeated until the algorithm converges or some criterion is reached. There are several ways to detect convergence. The most typical way is to run until none of the observations change cluster membership(used here).
Extras - Kmeans often gives local minima(not the best solution/not the optimal solution). To overcome this, Kmeans is run for several times each time taking a different set of initial random centroids.
I am going to standardize the variables, because three continuous variables we have here are not on the same scale(not on equal footing). Because income has larger values, it is going to dominate the cluster results and clusters are gonna form based on different values of income. Find the plot in below cell.
# TESTING — cluster on the raw (unstandardized) variables to demonstrate that
# the large income values dominate the result.
# random_state added so the demonstration is reproducible (consistent with the
# final model below).
test = KMeans(n_clusters = 4, random_state = 73)
test_fit = test.fit(data.loc[:,["income", "age", "Value"]])
# Plotting the scatter plot: income vs transaction value, colored by cluster
plt.scatter(data["income"], data["Value"], c = test_fit.labels_)
plt.xlabel("Income in dollars")
plt.ylabel("Value of the transaction")
plt.title("Clustering W/O Standardizing Variables")
plt.show()
# Plotting the scatter plot: income vs age, colored by cluster
plt.scatter(data["income"], data["age"], c = test_fit.labels_)
plt.ylabel("Age in years")
plt.xlabel("Income in dollars")
plt.title("Clustering W/O Standardizing Variables")
plt.show()
As we can see there are 4 visible groups in the income, because income has larger values compare to other variables.
# Creating new variables - Z-score of the three continuous variables so they
# are on an equal footing before clustering.
data["z_income"] = (data["income"] - data["income"].mean())/data["income"].std()
data["z_value"] = (data["Value"] - data["Value"].mean())/data["Value"].std()
data["z_age"] = (data["age"] - data["age"].mean())/data["age"].std()
# Within-cluster sum of squares (inertia) for k = 1..9
# random_state added so the elbow curve is reproducible (consistent with the
# final model below).
sum_of_square = []
for i in range(1,10):
    cluster = KMeans(n_clusters = i, random_state = 73)
    cluster.fit(data.loc[:, ["z_age", "z_income", "z_value"]])
    sum_of_square.append(cluster.inertia_)
The within-cluster sum of squares OR inertia is a measure of the variability of the observations within each cluster. In general, a cluster that has a small sum of squares is more compact than a cluster that has a large sum of squares. Clusters that have higher values exhibit greater variability of the observations within the cluster.
# Elbow plot: deciding how many clusters we should create
plt.scatter(range(1,10), sum_of_square)
plt.plot(range(1,10), sum_of_square)
plt.xlabel("Number of Clusters")
# fixed label typo ("Withing" -> "Within")
plt.ylabel("Within Sum of Squares")
plt.title("How many Clusters?")
plt.show()
As we can see, there is a clear elbow forming at 4 (optimal within-cluster sum of squares OR inertia), so we are going to choose 4 clusters for this data. A higher number of clusters has a lower within-cluster sum of squares, but it would be hard to interpret that many clusters.
# Fitting the final KMeans model on the standardized variables;
# random_state fixed so the cluster labels are reproducible.
cluster = KMeans(n_clusters = 4, random_state = 73)
cluster.fit(data.loc[:, ["z_age", "z_income", "z_value"]])
# Storing the cluster label of each observation
data["cluster"] = cluster.labels_
# Mean of the original (unscaled) variables per cluster.
# Selecting the three numeric columns BEFORE .mean() avoids the TypeError
# newer pandas raises when aggregating the non-numeric columns.
cluster_mean = data.groupby("cluster")[["Value", "income", "age"]].mean()
cluster_mean
# One dataframe per cluster label (0-3)
cluster1 = data[data["cluster"] == 0]
cluster2 = data[data["cluster"] == 1]
cluster3 = data[data["cluster"] == 2]
cluster4 = data[data["cluster"] == 3]
# Number of observations in each cluster
size_c1 = len(cluster1)
size_c2 = len(cluster2)
size_c3 = len(cluster3)
size_c4 = len(cluster4)
sizes = [size_c1, size_c2, size_c3, size_c4]
## Donut chart of the relative cluster sizes
plt.pie(sizes, colors = ["royalblue", "tomato", "seagreen", "yellow"],
labels = ["Cluster1", "Cluster2", "Cluster3", "Cluster4"], autopct = '%1.1f%%', startangle = 90,
pctdistance = 0.80, textprops = {"fontsize" : 19}, explode = (0.03,) * 4)
# A white circle over the centre turns the pie into a donut
plt.gca().add_artist(plt.Circle((0, 0), 0.60, fc = 'white'))
plt.title("Cluster Size")
plt.show()
%autoreload 2
# Creating list of list of income for 4 clusters
income_cluster = [cluster1["income"].values, cluster2["income"].values, cluster3["income"].values, cluster4["income"].values]
# Plotting the histogram and boxplot for income
custom_functions.create_cluster_histo_and_box(var_name = "Income", values = income_cluster, xlab = "Income in dollars")
Customers in cluster 4 have the highest median income. Higher variance in income can be seen in cluster 1, which also has the lowest median income. Clusters 2 and 3 have similar variance and also similar median incomes.
%autoreload 2
# Creating list of list of income for 4 clusters
value_cluster = [cluster1["Value"].values, cluster2["Value"].values, cluster3["Value"].values, cluster4["Value"].values]
# Plotting the histogram and boxplot for income
custom_functions.create_cluster_histo_and_box(var_name = "Value", values = value_cluster, xlab = "Value in dollars")
The distribution of transaction value in one of the clusters is almost uniform, meaning there is a similar number of customers in each bin range, while the other clusters have similar right-skewed distributions. Customers in cluster 2 have higher spending compared to the other clusters. Cluster 3 comprises the customers spending the least, and cluster 4 has higher variance in terms of spending.
%autoreload 2
# Creating list of list of income for 4 clusters
age_cluster = [cluster1["age"].values, cluster2["age"].values, cluster3["age"].values, cluster4["age"].values]
# Plotting the histogram and boxplot for income
custom_functions.create_cluster_histo_and_box(var_name = "Age", values = age_cluster, xlab = "Age in Years")
Customers in cluster 1 are older compared to customers in the other clusters. Clusters 2 and 3 have similar distributions. Cluster 4 has higher variance compared to the other clusters; the age range in cluster 4 is approximately 16-75, which suggests we can find customers of all ages in cluster 4.
%autoreload 2
nchildren_cluster = [cluster1["nchildren_processed"].value_counts(), cluster2["nchildren_processed"].value_counts(),
cluster3["nchildren_processed"].value_counts(), cluster4["nchildren_processed"].value_counts()]
custom_functions.create_cluster_bar("No of Children", values = nchildren_cluster, max_possible_val = int(data["nchildren"].max()),
xlab = "No of Children")
Only clusters 1 & 2 have a high number of children (>3). The distribution of children in clusters 1, 2 & 3 is the same for fewer than 4 children, but in cluster 4 more customers have no children.
%autoreload 2
sex_clusters = [cluster1["sex_processed"].value_counts()["Male"], cluster2["sex_processed"].value_counts()["Male"],
cluster3["sex_processed"].value_counts()["Male"], cluster4["sex_processed"].value_counts()["Male"]]
custom_functions.create_cluster_stacked_bar(var_name = "Gender", sizes = sizes, values = sex_clusters)
The % of females in cluster 1 is the highest among all the clusters, while it is the lowest in cluster 3.
%autoreload 2
pmethod_cluster = [cluster1["pmethod_processed"].value_counts(), cluster2["pmethod_processed"].value_counts(),
cluster3["pmethod_processed"].value_counts(), cluster4["pmethod_processed"].value_counts()]
custom_functions.create_cluster_donut_chart("Payment Methods", values = pmethod_cluster)
The two most common payment methods in all clusters are Card and EFTPOS. 53% of the customers in cluster 1 used a card to make a payment, which is the highest among all the clusters.
cluster2["parent"].value_counts()
%autoreload 2
parent_clusters = [cluster1["parent"].value_counts()["No"], cluster2["parent"].value_counts()["No"],
cluster3["parent"].value_counts()["No"], cluster4["parent"].value_counts()["No"]]
custom_functions.create_cluster_stacked_bar(var_name = "Parents", sizes = sizes, values = parent_clusters,
labels = ["Yes", "No"], colors_bar = ["royalblue", "mediumseagreen"],
colors_text = ["blue", "green"])
Similar proportions can be seen across the clusters.
%autoreload 2
homeown_cluster = [ cluster1["homeown_processed"].value_counts(),
cluster2["homeown_processed"].value_counts(),
cluster3["homeown_processed"].value_counts(),
cluster4["homeown_processed"].value_counts()]
custom_functions.create_cluster_bar_side("HomeOwn", values = homeown_cluster)
Most of the customers in clusters 1, 2 & 3 own a house, but in cluster 4 most of the customers do not own a house.
%autoreload 2
pet_owners_cluster = [cluster1["Pet Owner"].value_counts(), cluster2["Pet Owner"].value_counts(),
cluster3["Pet Owner"].value_counts(), cluster4["Pet Owner"].value_counts()]
custom_functions.create_cluster_donut_chart("Pet Owners", values = pet_owners_cluster, labels = ["No", "Yes"], ncluster = 4,
color = ["royalblue", "tomato"], figsize = (20, 12))
A similar % of pet ownership can be seen across all the clusters in the above chart.
%autoreload 2
ncluster = 4
# Dog owners
dog_owners = [
cluster1["dog food"].value_counts()[1], cluster2["dog food"].value_counts()[1],
cluster3["dog food"].value_counts()[1], cluster4["dog food"].value_counts()[1]
]
# Cat owners
cat_owners = [
cluster1["cat food"].value_counts()[1], cluster2["cat food"].value_counts()[1],
cluster3["cat food"].value_counts()[1], cluster4["cat food"].value_counts()[1]
]
# Owns both cat and dog
flag1 = (cluster1["dog food"].values == 1) & (cluster1["cat food"].values == 1)
flag2 = (cluster2["dog food"].values == 1) & (cluster2["cat food"].values == 1)
flag3 = (cluster3["dog food"].values == 1) & (cluster3["cat food"].values == 1)
flag4 = (cluster4["dog food"].values == 1) & (cluster4["cat food"].values == 1)
both_owners = [flag1.sum(), flag2.sum(), flag3.sum(), flag4.sum()]
#### CORRECTED ####
total = []
for i in range(ncluster):
# Dog owners
dog_owners[i] -= both_owners[i]
# Cat owners
cat_owners[i] -= both_owners[i]
# Total
total.append(dog_owners[i] + cat_owners[i] + both_owners[i])
custom_functions.create_cluster_venn_diagrams(dog_owners, cat_owners, both_owners, total, sizes)
The % of customers who own both a cat and a dog is almost the same (~10%) across the clusters.
I. Cluster 1 - Elderly High Spenders
Basically, this cluster can be seen as a smaller number of people (older women) earning more and spending more money.
II. Cluster 2 - High Spending Youth
Basically, this cluster can be seen as a moderate number of people earning less and spending more money.
III. Cluster 3 – All Ages & Broke
Basically, this cluster can be seen as a high number of people (of all ages) earning less and spending less money.
IV. Cluster 4 - Rich and Wise
Basically, this cluster can be seen as a low number of people (of all ages) earning more and spending a moderate amount of money.
# Final clusters on the raw scales. plt.show() after each scatter keeps the two
# plots on separate figures (otherwise the second overlays the first).
plt.scatter(data["income"], data["age"], c = data["cluster"])
plt.xlabel("Income in dollars")
plt.ylabel("Age in years")
plt.show()
plt.scatter(data["income"], data["Value"], c = data["cluster"])
plt.xlabel("Income in dollars")
plt.ylabel("Value of the transaction")
plt.show()
# Categorical dtype makes plotly use a discrete colour per cluster
data["cluster"] = data["cluster"].astype("category")
# Interactive 3-D view of the clusters over all three variables
fig = px.scatter_3d(data, x = 'income', y = 'Value', z = 'age',
                    color = "cluster" , opacity = 0.7)
fig.update_traces(marker = dict(size = 8),
                  selector = dict(mode = 'markers'))
fig.update_layout(margin = dict(l = 1, r = 1, b = 1, t = 1), legend_title = '<b> Cluster </b>')
fig.show()
# Drop the columns we no longer need; drop() returns a new dataframe,
# replacing the manual copy() + two del statements.
data_new = data.drop(columns = ["PostCode", "ReceiptID"])
# Persist the cleaned data. NOTE(review): the file is tab-separated despite the
# .csv extension — read it back with sep = '\t'.
data_new.to_csv("DATA/Cleaned_Simulated_Coles_data.csv", sep = '\t', index = False)